# load the packages
suppressMessages({
library(data.table)
library(dplyr)
library(tidyverse)
library(psych)
library(scales)
library(lubridate)
library(ggplot2)
library(hrbrthemes)
library(plotly)
library(forecast)
library(prophet)
})
set.seed(42)
calendar.csv - Contains information about the dates on which the products are sold. sell_prices.csv - Contains information about the price of the products sold per store and date. sales_train_validation.csv - Contains the historical daily unit sales data per product and store [d_1 - d_1913].
# read the data
calendar <- fread('data/calendar.csv')
price <- fread('data/sell_prices.csv')
sales <- fread('data/sales_train_validation.csv')
head(calendar)
## date wm_yr_wk weekday wday month year d event_name_1
## 1: 2011-01-29 11101 Saturday 1 1 2011 d_1
## 2: 2011-01-30 11101 Sunday 2 1 2011 d_2
## 3: 2011-01-31 11101 Monday 3 1 2011 d_3
## 4: 2011-02-01 11101 Tuesday 4 2 2011 d_4
## 5: 2011-02-02 11101 Wednesday 5 2 2011 d_5
## 6: 2011-02-03 11101 Thursday 6 2 2011 d_6
## event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI
## 1: 0 0 0
## 2: 0 0 0
## 3: 0 0 0
## 4: 1 1 0
## 5: 1 0 1
## 6: 1 1 1
head(price)
## store_id item_id wm_yr_wk sell_price
## 1: CA_1 HOBBIES_1_001 11325 9.58
## 2: CA_1 HOBBIES_1_001 11326 9.58
## 3: CA_1 HOBBIES_1_001 11327 8.26
## 4: CA_1 HOBBIES_1_001 11328 8.26
## 5: CA_1 HOBBIES_1_001 11329 8.26
## 6: CA_1 HOBBIES_1_001 11330 8.26
# train test split
basic <- select(sales, 1:6)
date <- select(sales, -1:-6)
date_num <- dim(date)[2]
test_size <- 28
train_size <- date_num - 28
train <- select(date, 1:all_of(train_size))
test <- select(date, (all_of(train_size)+1):all_of(date_num))
train <- cbind(basic, train)
test <- cbind(basic, test)
The data range from 2011-01-29 to 2016-06-19.
calendar$date <- as.Date(calendar$date, format='%Y-%m-%d')
calendar %>% summarize(min_date=min(date), max_date=max(date))
## min_date max_date
## 1 2011-01-29 2016-06-19
The data range for the training data set is from 2011-01-29 to 2016-03-27, and the data range for the test set is from 2016-03-28 to 2016-04-24.
train_date <- calendar$date[1:train_size]
test_date <- calendar$date[(train_size+1):(train_size+28)]
min(train_date)
## [1] "2011-01-29"
max(train_date)
## [1] "2016-03-27"
min(test_date)
## [1] "2016-03-28"
max(test_date)
## [1] "2016-04-24"
In our dataset, we have 3 categories of 3049 items in 7 departments. Those items were sold in 10 stores of 3 states.
# count unique item, dept, category, store, and state
length(unique(train[['item_id']]))
## [1] 3049
length(unique(train[['dept_id']]))
## [1] 7
length(unique(train[['cat_id']]))
## [1] 3
length(unique(train[['store_id']]))
## [1] 10
length(unique(train[['state_id']]))
## [1] 3
(top_item <- head(count(train, item_id, sort = TRUE), 5))
## # A tibble: 5 x 2
## item_id n
## <chr> <int>
## 1 FOODS_1_001 10
## 2 FOODS_1_002 10
## 3 FOODS_1_003 10
## 4 FOODS_1_004 10
## 5 FOODS_1_005 10
(top_dept <- count(train, dept_id, sort = TRUE))
## # A tibble: 7 x 2
## dept_id n
## <chr> <int>
## 1 FOODS_3 8230
## 2 HOUSEHOLD_1 5320
## 3 HOUSEHOLD_2 5150
## 4 HOBBIES_1 4160
## 5 FOODS_2 3980
## 6 FOODS_1 2160
## 7 HOBBIES_2 1490
(top_cat <- count(train, cat_id, sort = TRUE))
## # A tibble: 3 x 2
## cat_id n
## <chr> <int>
## 1 FOODS 14370
## 2 HOUSEHOLD 10470
## 3 HOBBIES 5650
(top_store <- count(train, store_id, sort = TRUE))
## # A tibble: 10 x 2
## store_id n
## <chr> <int>
## 1 CA_1 3049
## 2 CA_2 3049
## 3 CA_3 3049
## 4 CA_4 3049
## 5 TX_1 3049
## 6 TX_2 3049
## 7 TX_3 3049
## 8 WI_1 3049
## 9 WI_2 3049
## 10 WI_3 3049
(top_state <- count(train, state_id, sort = TRUE))
## # A tibble: 3 x 2
## state_id n
## <chr> <int>
## 1 CA 12196
## 2 TX 9147
## 3 WI 9147
FOODS category and FOODS_3 department have the largest number of products.
ggplot(train, aes(x=dept_id)) +
geom_bar(fill='steelblue') +
ggtitle('Item Count by Department') +
theme(plot.title = element_text(hjust = 0.5))
ggplot(train, aes(x=cat_id)) +
geom_bar(fill='steelblue') +
ggtitle('Item Count by Category') +
theme(plot.title = element_text(hjust = 0.5))
There are 4 stores in CA, 3 in TX, and 3 in WI.
(store_state <- train %>% group_by(state_id) %>% summarize(unique_stores=n_distinct(store_id)))
## # A tibble: 3 x 2
## state_id unique_stores
## <chr> <int>
## 1 CA 4
## 2 TX 3
## 3 WI 3
ggplot(store_state, aes(state_id, unique_stores)) +
geom_col(fill='steelblue') +
ggtitle('Number of Stores by State') +
theme(plot.title = element_text(hjust = 0.5))
# read the data
train <- fread('data/train.csv')
test <- fread('data/test.csv')
head(train)
## V1 id item_id dept_id cat_id
## 1: 1 HOBBIES_1_001_CA_1_validation HOBBIES_1_001 HOBBIES_1 HOBBIES
## 2: 2 HOBBIES_1_002_CA_1_validation HOBBIES_1_002 HOBBIES_1 HOBBIES
## 3: 3 HOBBIES_1_003_CA_1_validation HOBBIES_1_003 HOBBIES_1 HOBBIES
## 4: 4 HOBBIES_1_004_CA_1_validation HOBBIES_1_004 HOBBIES_1 HOBBIES
## 5: 5 HOBBIES_1_005_CA_1_validation HOBBIES_1_005 HOBBIES_1 HOBBIES
## 6: 6 HOBBIES_1_006_CA_1_validation HOBBIES_1_006 HOBBIES_1 HOBBIES
## store_id state_id d sales date wm_yr_wk weekday wday month
## 1: CA_1 CA d_1 0 2011-01-29 11101 Saturday 1 1
## 2: CA_1 CA d_1 0 2011-01-29 11101 Saturday 1 1
## 3: CA_1 CA d_1 0 2011-01-29 11101 Saturday 1 1
## 4: CA_1 CA d_1 0 2011-01-29 11101 Saturday 1 1
## 5: CA_1 CA d_1 0 2011-01-29 11101 Saturday 1 1
## 6: CA_1 CA d_1 0 2011-01-29 11101 Saturday 1 1
## year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA
## 1: 2011 0
## 2: 2011 0
## 3: 2011 0
## 4: 2011 0
## 5: 2011 0
## 6: 2011 0
## snap_TX snap_WI sell_price
## 1: 0 0 NA
## 2: 0 0 NA
## 3: 0 0 NA
## 4: 0 0 NA
## 5: 0 0 NA
## 6: 0 0 NA
head(test)
## V1 id item_id dept_id cat_id
## 1: 1 HOBBIES_1_001_CA_1_validation HOBBIES_1_001 HOBBIES_1 HOBBIES
## 2: 2 HOBBIES_1_002_CA_1_validation HOBBIES_1_002 HOBBIES_1 HOBBIES
## 3: 3 HOBBIES_1_003_CA_1_validation HOBBIES_1_003 HOBBIES_1 HOBBIES
## 4: 4 HOBBIES_1_004_CA_1_validation HOBBIES_1_004 HOBBIES_1 HOBBIES
## 5: 5 HOBBIES_1_005_CA_1_validation HOBBIES_1_005 HOBBIES_1 HOBBIES
## 6: 6 HOBBIES_1_006_CA_1_validation HOBBIES_1_006 HOBBIES_1 HOBBIES
## store_id state_id d sales date wm_yr_wk weekday wday month
## 1: CA_1 CA d_1886 1 2016-03-28 11609 Monday 3 3
## 2: CA_1 CA d_1886 1 2016-03-28 11609 Monday 3 3
## 3: CA_1 CA d_1886 0 2016-03-28 11609 Monday 3 3
## 4: CA_1 CA d_1886 0 2016-03-28 11609 Monday 3 3
## 5: CA_1 CA d_1886 1 2016-03-28 11609 Monday 3 3
## 6: CA_1 CA d_1886 0 2016-03-28 11609 Monday 3 3
## year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA
## 1: 2016 NA NA NA NA 0
## 2: 2016 NA NA NA NA 0
## 3: 2016 NA NA NA NA 0
## 4: 2016 NA NA NA NA 0
## 5: 2016 NA NA NA NA 0
## 6: 2016 NA NA NA NA 0
## snap_TX snap_WI sell_price
## 1: 0 0 8.26
## 2: 0 0 3.97
## 3: 0 0 2.97
## 4: 0 0 4.64
## 5: 0 0 2.88
## 6: 0 0 0.96
We can see an upward trend from 2011 to 2016. We can also observe seasonality with annual peaks around September and October, and a dip on Christmas every year when all stores closed.
daily_sales_df <- train %>% group_by(date) %>% summarize(daily_sales=sum(sales))
daily_sales_df$date <- as.Date(daily_sales_df$date)
head(daily_sales_df)
## # A tibble: 6 x 2
## date daily_sales
## <date> <int>
## 1 2011-01-29 32631
## 2 2011-01-30 31749
## 3 2011-01-31 23783
## 4 2011-02-01 25412
## 5 2011-02-02 19146
## 6 2011-02-03 29211
fig <- daily_sales_df %>%
ggplot(aes(x=date, y=daily_sales, group=1)) +
geom_area(fill='#69b3a2', alpha=0.5) +
geom_line(color='#69b3a2') +
geom_smooth(method='lm', formula=y~x, se=FALSE, size=0.5, color='gray20') +
labs(x='Date', y='Sales', title='Daily Sales') +
scale_x_date(date_breaks='3 month', date_labels='%b %y') +
theme_ipsum() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplotly(fig, dynamicTicks=TRUE)
There are 3049 distinct items in the dataset. We will randomly selected 20 items to plot here.
set.seed(42)
random_items <- sample(unique(train$item_id), 20)
item_sales_df <- train %>%
filter(item_id %in% random_items) %>%
group_by(date, item_id) %>%
summarize(item_sales=sum(sales))
item_sales_df$date <- as.Date(item_sales_df$date)
fig <- item_sales_df %>%
ggplot(aes(x=date, y=item_sales, col=item_id)) +
geom_line() +
labs(x='Date', y='Sales', title='Daily Sales by Item') +
scale_x_date(date_breaks='3 month', date_labels='%b %y') +
theme_ipsum() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplotly(fig, dynamicTicks=TRUE)
dept_sales_df <- train %>% group_by(date, dept_id) %>% summarize(dept_sales=sum(sales))
dept_sales_df$date <- as.Date(dept_sales_df$date)
fig <- dept_sales_df %>%
ggplot(aes(x=date, y=dept_sales, col=dept_id)) +
geom_line() +
labs(x='Date', y='Sales', title='Daily Sales by Department') +
scale_x_date(date_breaks='3 month', date_labels='%b %y') +
theme_ipsum() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplotly(fig, dynamicTicks=TRUE)
cat_sales_df <- train %>% group_by(date, cat_id) %>% summarize(cat_sales=sum(sales))
cat_sales_df$date <- as.Date(cat_sales_df$date)
fig <- cat_sales_df %>%
ggplot(aes(x=date, y=cat_sales, col=cat_id)) +
geom_line() +
labs(x='Date', y='Sales', title='Daily Sales by Product Category') +
scale_x_date(date_breaks='3 month', date_labels='%b %y') +
theme_ipsum() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplotly(fig, dynamicTicks=TRUE)
store_sales_df <- train %>% group_by(date, store_id) %>% summarize(store_sales=sum(sales))
store_sales_df$date <- as.Date(store_sales_df$date)
fig <- store_sales_df %>%
ggplot(aes(x=date, y=store_sales, col=store_id)) +
geom_line() +
labs(x='Date', y='Sales', title='Daily Sales by Store') +
scale_x_date(date_breaks='3 month', date_labels='%b %y') +
theme_ipsum() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplotly(fig, dynamicTicks=TRUE)
state_sales_df <- train %>% group_by(date, state_id) %>% summarize(state_sales=sum(sales))
state_sales_df$date <- as.Date(state_sales_df$date)
fig <- state_sales_df %>%
ggplot(aes(x=date, y=state_sales, col=state_id)) +
geom_line() +
labs(x='Date', y='Sales', title='Daily Sales by State') +
scale_x_date(date_breaks='3 month', date_labels='%b %y') +
theme_ipsum() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplotly(fig, dynamicTicks=TRUE)
There are 3049 distinct items in the dataset. We will randomly selected 20 items to plot here.
set.seed(42)
random_items <- sample(unique(train$item_id), 20)
item_price_df <- train %>%
filter(item_id %in% random_items) %>%
group_by(date, item_id) %>%
summarize(item_price=mean(sell_price))
item_price_df$date <- as.Date(item_price_df$date)
fig <- item_price_df %>%
ggplot(aes(x=date, y=item_price, col=item_id)) +
geom_line() +
labs(x='Date', y='Price', title='Item Daily Average Price') +
scale_x_date(date_breaks='3 month', date_labels='%b %y') +
theme_ipsum() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplotly(fig, dynamicTicks=TRUE)
dept_price_df <- train %>% group_by(date, dept_id) %>% summarize(dept_price=mean(sell_price,
na.rm=TRUE))
dept_price_df$date <- as.Date(dept_price_df$date)
fig <- dept_price_df %>%
ggplot(aes(x=date, y=dept_price, col=dept_id)) +
geom_line() +
labs(x='Date', y='Price', title='Department Daily Average Price') +
scale_x_date(date_breaks='3 month', date_labels='%b %y') +
theme_ipsum() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplotly(fig, dynamicTicks=TRUE)
cat_price_df <- train %>% group_by(date, cat_id) %>% summarize(cat_price=mean(sell_price,
na.rm=TRUE))
cat_price_df$date <- as.Date(cat_price_df$date)
fig <- cat_price_df %>%
ggplot(aes(x=date, y=cat_price, col=cat_id)) +
geom_line() +
labs(x='Date', y='Price', title='Daily Average Price by Category') +
scale_x_date(date_breaks='3 month', date_labels='%b %y') +
theme_ipsum() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplotly(fig, dynamicTicks=TRUE)
store_price_df <- train %>% group_by(date, store_id) %>% summarize(store_price=mean(sell_price,
na.rm=TRUE))
store_price_df$date <- as.Date(store_price_df$date)
fig <- store_price_df %>%
ggplot(aes(x=date, y=store_price, col=store_id)) +
geom_line() +
labs(x='Date', y='Price', title='Daily Average Price by Store') +
scale_x_date(date_breaks='3 month', date_labels='%b %y') +
theme_ipsum() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplotly(fig, dynamicTicks=TRUE)
state_price_df <- train %>% group_by(date, state_id) %>% summarize(state_price=mean(sell_price,
na.rm=TRUE))
state_price_df$date <- as.Date(state_price_df$date)
fig <- state_price_df %>%
ggplot(aes(x=date, y=state_price, col=state_id)) +
geom_line() +
labs(x='Date', y='Price', title='Daily Average Price by State') +
scale_x_date(date_breaks='3 month', date_labels='%b %y') +
theme_ipsum() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplotly(fig, dynamicTicks=TRUE)